First, we are going to bring in two datasets. The first covers CEO compensation in the IT industry (2019). The second is a reduced dataset on the sales price of tractors.

# Load both Excel workbooks from ../data into data frames.
CEO <- read_excel(path = "../data/CEOCompIT.xlsx")
Tractor <- read_excel(path = "../data/TractorBivariate.xlsx")

CEO Compensation

Summary Statistics

# Per-column overview of the CEO data (quartiles and mean for numeric columns).
summary(object = CEO)
##    Company              Name           Total Compensation
##  Length:336         Length:336         Min.   :       0  
##  Class :character   Class :character   1st Qu.: 2158707  
##  Mode  :character   Mode  :character   Median : 4869462  
##                                        Mean   : 7247166  
##                                        3rd Qu.: 9628200  
##                                        Max.   :66935100
# Standard deviation of total compensation (not reported by summary()).
sd(x = CEO[["Total Compensation"]])
## [1] 8027321

Upgraded Summary Statistics

# Formatted summary table: "continuous2" puts each statistic on its own row
# (mean/SD, median/quartiles, range).
CEO %>%
  select(`Total Compensation`) %>%
  tbl_summary(
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c(
        "{mean} ({sd})",
        "{median} ({p25}, {p75})",
        "{min}, {max}"
      ),
      all_categorical() ~ "{n} / {N} ({p}%)"
    )
  )
Characteristic N = 336
Total Compensation
Mean (SD) 7,247,166 (8,027,321)
Median (IQR) 4,869,462 (2,158,707, 9,628,200)
Range 0, 66,935,100

Histogram

# Quick base-R histogram; "fd" selects the Freedman-Diaconis break rule.
hist(x = CEO$`Total Compensation`, breaks = "fd")

Upgraded Histogram

# Express compensation in millions of dollars so axis labels stay readable.
# The new column is reused by the boxplots further down.
CEO$CompensationMillions <- CEO$`Total Compensation` / 1e6

# Freedman-Diaconis bin width: 2 * IQR / n^(1/3).
binsize <- 2 * IQR(CEO$CompensationMillions) /
  length(CEO$CompensationMillions)^(1 / 3)

ggplot(CEO, aes(CompensationMillions)) +
  geom_histogram(
    binwidth = binsize,
    color = "black",
    fill = "darkblue",
    alpha = 0.5
  ) +
  labs(
    title = "Distribution of CEO Compensation in IT Industry 2019",
    # Caption previously read "(SourceAFL-CIO)" with the separator missing.
    caption = "(Source: AFL-CIO)",
    x = "Compensation [millions]",
    y = "Frequency"
  ) +
  theme_bw()

Boxplot

# Quick base-R boxplot of compensation in millions.
boxplot(x = CEO$CompensationMillions)

Upgraded Boxplot

# Interactive boxplot; `text` attaches a "Company - Name" label to each
# observation.
plot_ly(
  y = CEO$CompensationMillions,
  type = "box",
  name = "Compensation [millions]",
  text = paste(CEO$Company, "-", CEO$Name)
) %>%
  layout(title = "Distribution of CEO Compensation in IT Industry 2019")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Tractor

Summary Statistics

Histogram

Boxplot

Scatterplot and Correlation

# Quick base-R scatterplot: horsepower on the x-axis, sales price on the y-axis.
plot(x = Tractor$HorsePower, y = Tractor$`Sales Price`)

# Pearson correlation between sales price and horsepower.
cor(
  x = Tractor$`Sales Price`,
  y = Tractor$HorsePower,
  method = "pearson"
)
## [1] 0.6643812

Upgraded Scatterplot

# Scatterplot of sales price against horsepower with a fitted least-squares line.
ggplot(Tractor, aes(HorsePower, `Sales Price`)) +
  geom_point(color = "gray40") +
  # Spell out FALSE (`F` is an ordinary, reassignable variable) and state the
  # default lm formula explicitly so geom_smooth() does not emit its
  # "using formula" message.
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    color = "darkblue"
  ) +
  theme_bw() +
  labs(title = "Relationship between Horsepower and Sales Price of Tractors")